Project Description¶
In this project, we will explore data about penguins in Antarctica using unsupervised learning. Our goal is to discover hidden patterns and natural groupings (clusters) among the penguins based on their physical features.
Background¶
A team of researchers studying penguins in Antarctica has been collecting data, and they’ve asked for our help! They believe there are at least three penguin species—Adelie, Chinstrap, and Gentoo—but unfortunately, they did not record the species labels in the dataset.
We’ve been given the dataset in CSV format as:
penguins.csv
Dataset Information¶
Source:
Data collected by Dr. Kristen Gorman and the Palmer Station, Antarctica LTER, part of the Long Term Ecological Research Network.
Features Available:
| Column | Description |
|---|---|
| culmen_length_mm | Length of the penguin's bill (mm) |
| culmen_depth_mm | Depth of the penguin's bill (mm) |
| flipper_length_mm | Length of the flipper (mm) |
| body_mass_g | Body mass (grams) |
| sex | Penguin's sex (Male/Female) |
Our Task¶
We will:
- Explore and process the data.
- Perform clustering analysis to identify natural groups of penguins.
- Decide on a reasonable number of clusters (hint: we expect ~3 species).
- Analyze and compare the average feature values for each cluster.
Final Outcome¶
By using unsupervised learning techniques like K-Means Clustering, we aim to:
- Group penguins into distinct clusters.
- Provide meaningful insights to the research team.
- Possibly help them identify penguin species based on the cluster patterns.
Let’s dive in!
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Loading and examining the dataset
penguins_df = pd.read_csv("penguins.csv")
penguins_df.head()
| | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | sex |
|---|---|---|---|---|---|
| 0 | 39.1 | 18.7 | 181.0 | 3750.0 | MALE |
| 1 | 39.5 | 17.4 | 186.0 | 3800.0 | FEMALE |
| 2 | 40.3 | 18.0 | 195.0 | 3250.0 | FEMALE |
| 3 | 36.7 | 19.3 | 193.0 | 3450.0 | FEMALE |
| 4 | 39.3 | 20.6 | 190.0 | 3650.0 | MALE |
penguins_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332 entries, 0 to 331
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   culmen_length_mm   332 non-null    float64
 1   culmen_depth_mm    332 non-null    float64
 2   flipper_length_mm  332 non-null    float64
 3   body_mass_g        332 non-null    float64
 4   sex                332 non-null    object
dtypes: float64(4), object(1)
memory usage: 13.1+ KB
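The info() output shows 332 non-null entries in every column, so no imputation is needed for this particular file. A quick defensive check still costs little; the snippet below is a minimal sketch, and the dropna() call is a no-op on this CSV.
# Sanity check: count missing values per column (all zeros for this file)
print(penguins_df.isna().sum())
# Defensive no-op here, but useful if a different export of penguins.csv has gaps
penguins_df = penguins_df.dropna()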
# Convert categorical variables into dummy/indicator variables
penguins_df = pd.get_dummies(penguins_df, dtype='int')
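# Note (optional alternative, not used in this notebook): get_dummies creates both
# sex_FEMALE and sex_MALE, which are redundant (one is always 1 minus the other)
# and effectively double-weights sex in the Euclidean distance used by k-means.
# Passing drop_first=True would keep a single 0/1 indicator instead, e.g.:
# penguins_alt = pd.get_dummies(pd.read_csv("penguins.csv"), dtype=int, drop_first=True)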
# Scaling variables
scaler = StandardScaler()
X = scaler.fit_transform(penguins_df)
penguins_preprocessed = pd.DataFrame(data=X,columns=penguins_df.columns)
penguins_preprocessed.head()
| | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | sex_FEMALE | sex_MALE |
|---|---|---|---|---|---|---|
| 0 | -0.903906 | 0.790360 | -1.425342 | -0.566948 | -0.993994 | 0.993994 |
| 1 | -0.830434 | 0.126187 | -1.068577 | -0.504847 | 1.006042 | -1.006042 |
| 2 | -0.683490 | 0.432728 | -0.426399 | -1.187953 | 1.006042 | -1.006042 |
| 3 | -1.344738 | 1.096901 | -0.569105 | -0.939551 | 1.006042 | -1.006042 |
| 4 | -0.867170 | 1.761074 | -0.783164 | -0.691149 | -0.993994 | 0.993994 |
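As a quick check that scaling behaved as expected, every column of penguins_preprocessed should now have a mean of roughly 0 and a standard deviation of roughly 1:
# Verify standardization: per-column mean ~0 and standard deviation ~1
print(penguins_preprocessed.mean().round(3))
print(penguins_preprocessed.std().round(3))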
# Detect the optimal number of clusters for k-means using the elbow method
inertia = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(penguins_preprocessed)
    inertia.append(kmeans.inertia_)

plt.plot(range(1, 10), inertia, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.show()
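The elbow in the inertia curve can be subtle, so it is worth cross-checking it with a second diagnostic. The sketch below computes the mean silhouette score for each candidate k on the same scaled features (higher is better).
from sklearn.metrics import silhouette_score

# Cross-check the elbow method: mean silhouette score for each candidate k
for k in range(2, 10):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(penguins_preprocessed)
    print(f"k={k}: silhouette = {silhouette_score(penguins_preprocessed, labels):.3f}")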
# Run k-means clustering with k=4, chosen from the elbow plot
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(penguins_preprocessed)
penguins_df['label'] = kmeans.labels_
# visualize the clusters (here for the 'culmen_length_mm' column)
plt.scatter(penguins_df['label'], penguins_df['culmen_length_mm'], c=kmeans.labels_, cmap='viridis')
plt.xlabel('Cluster')
plt.ylabel('culmen_length_mm')
plt.xticks(range(int(penguins_df['label'].min()), int(penguins_df['label'].max()) + 1))
plt.title(f'K-means Clustering (K={n_clusters})')
plt.show()
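Plotting the cluster label against a single feature only shows separation along one axis. A complementary view, sketched below, plots two of the original measurements against each other and colors each point by its cluster assignment.
# Complementary view: two raw features against each other, colored by cluster
plt.scatter(penguins_df['culmen_length_mm'], penguins_df['flipper_length_mm'],
            c=penguins_df['label'], cmap='viridis')
plt.xlabel('culmen_length_mm')
plt.ylabel('flipper_length_mm')
plt.title(f'K-means clusters in feature space (K={n_clusters})')
plt.show()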
# create `stat_penguins` DataFrame
numeric_columns = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm','label']
stat_penguins = penguins_df[numeric_columns].groupby('label').mean()
stat_penguins
| label | culmen_length_mm | culmen_depth_mm | flipper_length_mm |
|---|---|---|---|
| 0 | 43.878302 | 19.111321 | 194.764151 |
| 1 | 45.563793 | 14.237931 | 212.706897 |
| 2 | 40.217757 | 17.611215 | 189.046729 |
| 3 | 49.473770 | 15.718033 | 221.540984 |
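Before reading too much into these averages, it is worth checking how many penguins landed in each cluster, since a very small cluster would make its mean values unreliable:
# Number of penguins assigned to each cluster
print(penguins_df['label'].value_counts().sort_index())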
Final Thoughts:¶
The clusters appear well separated, so K-means has identified distinct groups. However, clusters 0, 1, and 2 overlap in culmen length, which may indicate that K=4 is slightly high, or that another clustering method (e.g., DBSCAN or hierarchical clustering) might yield better results; a quick comparison with hierarchical clustering is sketched below.
Cluster 3 (Yellow):¶
This cluster has the highest average culmen length (about 49.5 mm in the table above), a distinct trait that sets these penguins apart from the rest.
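As noted above, another clustering method can serve as a sanity check on the K-means result. The sketch below runs agglomerative (hierarchical) clustering on the same scaled features and cross-tabulates the two labelings; large blocks of agreement would suggest the groups are robust to the choice of algorithm.
from sklearn.cluster import AgglomerativeClustering

# Sanity check: compare K-means labels with hierarchical clustering on the same data
agg_labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(penguins_preprocessed)
print(pd.crosstab(penguins_df['label'], agg_labels,
                  rownames=['kmeans'], colnames=['agglomerative']))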